El objetivo de este trabajo práctico es aplicar técnicas de análisis exploratorio y visualización de datos sobre un conjunto de datos real, utilizando buenas prácticas vistas en la materia. Los estudiantes deberán desarrollar habilidades para extraer, interpretar y comunicar información relevante a través de gráficos eficaces.
Utilizando el dataset listings.csv de Airbnb Buenos Aires, deberán:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import warnings
import folium
from folium.plugins import HeatMap
from IPython.display import display, Markdown
import matplotlib.cm as cm
import matplotlib.colors as colors
import altair as alt
import pandas as pd
# Default size (width, height in inches) for every matplotlib figure created below.
plt.rcParams.update({'figure.figsize': (12, 8)})
Analizar los atributos disponibles: tipos de datos, valores nulos, variables categóricas y numéricas, etc.
# Load the Airbnb listings and the data dictionary that describes each field.
listings_df = pd.read_csv('listings.csv')
reference_df = pd.read_csv('reference.csv')

# Overview of the listings table. The header names the file that was actually
# read ('listings.csv'), and display() is used so the preview is rendered even
# when it is not the last expression of a notebook cell.
print("Dataset 'listings.csv':\n")
print(f"Forma del dataset: {listings_df.shape}\n")
print(f"Columnas: {list(listings_df.columns)}\n")
print("Primeras 5 filas:")
display(listings_df.head())

# Overview of the data dictionary (shown in full).
print("Dataset 'reference.csv':\n")
print(f"Forma del dataset: {reference_df.shape}\n")
print(f"Columnas: {list(reference_df.columns)}\n")
print("Filas:")
display(reference_df)

# Size, dtypes and summary statistics of the listings table.
n_rows, n_cols = listings_df.shape
print("Dimensiones del dataset")
print(f"{n_rows:,} filas y {n_cols} columnas")
print("\nInformación de tipos de datos\n")
listings_df.info()
print("\nEstadísticas descriptivas\n")
display(listings_df.describe())
A su vez, las variables numéricas y categóricas se clasifican según las escalas de medición planteadas por S. S. Stevens:
| Nº | Variable | Tipo de Variable | Subtipo |
|---|---|---|---|
| 0 | id | Numérica | Discreta |
| 1 | name | Categórica | Nominal |
| 2 | host_id | Numérica | Discreta |
| 3 | host_name | Categórica | Nominal |
| 4 | neighbourhood_group | Categórica | Nominal |
| 5 | neighbourhood | Categórica | Nominal |
| 6 | latitude | Numérica | Continua |
| 7 | longitude | Numérica | Continua |
| 8 | room_type | Categórica | Nominal |
| 9 | price | Numérica | Continua |
| 10 | minimum_nights | Numérica | Discreta |
| 11 | number_of_reviews | Numérica | Discreta |
| 12 | last_review | Categórica | Nominal |
| 13 | reviews_per_month | Numérica | Continua |
| 14 | calculated_host_listings_count | Numérica | Discreta |
| 15 | availability_365 | Numérica | Discreta |
| 16 | number_of_reviews_ltm | Numérica | Discreta |
| 17 | license | Categórica | Nominal |
# Print the data-dictionary description of every column of the listings table.
print("Descripción de las columnas:")

# Field name -> description, taken from the reference table.
column_descriptions = dict(zip(reference_df['Field'], reference_df['Description']))

for col in listings_df.columns:
    # Columns missing from the data dictionary get an explicit placeholder.
    description = column_descriptions.get(col, "(No hay descripción disponible)")
    print(f"• {col}: {description}")
print("Análisis de la cantidad valores nulos por columna:")

# Count missing values once and reuse the result everywhere below.
nulls_per_column = listings_df.isnull().sum()

# Per-column table of missing values, worst columns first.
null_analysis = pd.DataFrame({
    'Columna': listings_df.columns,
    'Valores_Nulos': nulls_per_column,
    'Porcentaje_Nulos': (nulls_per_column / len(listings_df)) * 100,
}).sort_values('Porcentaje_Nulos', ascending=False)

# Show only the columns that actually contain missing values.
display(null_analysis[null_analysis['Valores_Nulos'] > 0])

# Data for the charts.
null_counts = nulls_per_column[nulls_per_column > 0].sort_values(ascending=False)
total_columns = listings_df.shape[1]
columns_with_nulls = (nulls_per_column > 0).sum()
columns_without_nulls = total_columns - columns_with_nulls

# Two side-by-side panels: counts per column and share of affected columns.
fig, axes = plt.subplots(1, 2, figsize=(18, 6))

# Horizontal bar chart: number of missing values per column.
sns.barplot(
    x=null_counts.values.tolist(),
    y=null_counts.index.tolist(),
    ax=axes[0],
    palette="Reds_d",
)
axes[0].set_title('Cantidad de Valores Nulos por Columna')
axes[0].set_xlabel('Número de Valores Nulos')

# Pie chart: columns with vs. without missing values.
axes[1].pie(
    [columns_with_nulls, columns_without_nulls],
    labels=['Con Valores Nulos', 'Sin Valores Nulos'],
    autopct='%1.1f%%',
    colors=['#e74c3c', '#2ecc71'],
    startangle=140,
    explode=(0.05, 0),
)
axes[1].set_title('Porcentaje de Columnas con/sin Valores Nulos')

plt.tight_layout()
plt.show()
Detectar patrones generales, distribuciones, relaciones entre variables y outliers.
# Count fully duplicated rows.
duplicates = listings_df.duplicated().sum()
print(f"\nNúmero de filas duplicadas: {duplicates}")

# Work on a copy so the raw table stays untouched, and parse 'last_review'
# as a datetime; values that cannot be parsed become NaT (errors='coerce').
print("\nCambio de formato para columna: 'last_review' de object a datetime\n")
df_clean = listings_df.copy()
df_clean['last_review'] = pd.to_datetime(df_clean['last_review'], errors='coerce')
print(df_clean.dtypes)
# Summary of the numeric variables.
print("Análisis de variables numéricas\n")
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
print(f"Variables numéricas: {list(numeric_cols)}")

# Descriptive statistics.
print("\nEstadísticas Descriptivas:")
display(df_clean[numeric_cols].describe())

# Pairwise correlation between the numeric columns, drawn as a heat map.
corr_data = df_clean[numeric_cols].corr()
print("\nMatriz de Correlación entre Variables Numéricas:")
plt.figure(figsize=(12, 10))
sns.heatmap(corr_data, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=0.5)
plt.tight_layout()
plt.show()

# Absolute correlation of every numeric column with price, strongest first.
price_corr = corr_data['price'].abs().sort_values(ascending=False)
print("Análisis de Precios:\n")

# Keep valid prices only: NaN compares False against 0, so this single mask
# drops both missing and non-positive prices.
valid_prices = df_clean.loc[df_clean['price'] > 0, 'price']

print(f"Precio promedio: ${valid_prices.mean():,.2f}")
print(f"Precio mediano: ${valid_prices.median():,.2f}")
print(f"Precio mínimo: ${valid_prices.min():,.2f}")
print(f"Precio máximo: ${valid_prices.max():,.2f}")
print(f"Desviación estándar: ${valid_prices.std():,.2f}")

plt.figure(figsize=(15, 10))

# Panel 1: histogram of all valid prices.
plt.subplot(2, 2, 1)
plt.hist(valid_prices.values, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
plt.title('Distribución de Precios')
plt.xlabel('Precio ($)')
plt.ylabel('Frecuencia')

# Panel 2: box plot of all valid prices.
plt.subplot(2, 2, 2)
plt.boxplot(valid_prices.values)
plt.title('Boxplot de Precios')
plt.ylabel('Precio ($)')

# Panel 3: histogram without the top 5 % so the bulk of the data is readable.
plt.subplot(2, 2, 3)
q95 = valid_prices.quantile(0.95)
filtered_prices = valid_prices[valid_prices <= q95]
plt.hist(filtered_prices.values, bins=50, alpha=0.7, color='lightgreen', edgecolor='black')
plt.title('Distribución de Precios (sin 5% superior)')
plt.xlabel('Precio ($)')
plt.ylabel('Frecuencia')

# Panel 4: price by room type (only when that column exists).
plt.subplot(2, 2, 4)
if 'room_type' in df_clean.columns:
    df_price_room = df_clean[df_clean['price'] > 0]
    sns.boxplot(data=df_price_room, x='room_type', y='price')
    plt.title('Precios por Tipo de Habitación')
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()
print("Análisis de Outliers:")


def detect_outliers_iqr(data, column, k=1.5):
    """Flag the rows of ``data`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where Q1 and Q3 are
    the 25th and 75th percentiles of ``data[column]``.

    Args:
        data: DataFrame to inspect.
        column: Name of the column to evaluate.
        k: Fence multiplier. The default 1.5 reproduces the usual box-plot
            rule; larger values flag fewer rows.

    Returns:
        A tuple ``(outliers, lower_bound, upper_bound)``: the rows strictly
        below the lower fence or strictly above the upper fence, followed by
        the two fence values.
    """
    q1 = data[column].quantile(0.25)
    q3 = data[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr
    outside_fences = (data[column] < lower_bound) | (data[column] > upper_bound)
    return data[outside_fences], lower_bound, upper_bound
# Columns inspected for outliers, restricted to the ones present in the data.
key_numeric_cols = [
    col
    for col in ['price', 'minimum_nights', 'number_of_reviews', 'availability_365']
    if col in df_clean.columns
]

for col in key_numeric_cols:
    # Skip columns that hold no values at all.
    if not df_clean[col].notna().any():
        continue

    clean_data = df_clean.dropna(subset=[col])
    if col == 'price':
        # Only strictly positive prices are considered valid.
        clean_data = clean_data[clean_data[col] > 0]

    outliers, lower, upper = detect_outliers_iqr(clean_data, col)
    n_outliers = len(outliers)

    print(f"\n{col.upper()}:")
    print(f" Rango normal: {lower:.2f} - {upper:.2f}")
    print(f" Outliers detectados: {n_outliers} ({n_outliers/len(clean_data)*100:.1f}%)")
    if n_outliers > 0:
        print(f" Valores extremos: {clean_data[col].min():.2f} - {clean_data[col].max():.2f}")
# Distribution of the per-row host listing count, in full and without the top 5 %.
host_data = df_clean['calculated_host_listings_count'].dropna()

plt.figure(figsize=(15, 5))

# Left panel: full distribution.
plt.subplot(1, 2, 1)
plt.hist(host_data.values, bins=30, alpha=0.7, color='purple', edgecolor='black')
plt.title('Distribución de Listados por Host')
plt.xlabel('Número de Listados por Host')
plt.ylabel('Frecuencia')

# Right panel: values up to the 95th percentile only.
plt.subplot(1, 2, 2)
filtered_host_data = host_data[host_data <= host_data.quantile(0.95)]
plt.hist(filtered_host_data.values, bins=20, alpha=0.7, color='mediumpurple', edgecolor='black')
plt.title('Distribución de Listados por Host (sin 5% superior)')
plt.xlabel('Número de Listados por Host')
plt.ylabel('Frecuencia')

plt.tight_layout()
plt.show()
print("Top 10 Hosts con Más Departamentos en Alquiler:\n")

# NOTE: both calls below change global plotting defaults for the rest of the session.
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

# The ten host ids that appear most often in the listings.
top_hosts = df_clean['host_id'].value_counts().head(10)

plt.figure(figsize=(10, 6))
# Host ids are cast to str so they are drawn as categories, not as numbers.
plt.barh(top_hosts.index.astype(str), top_hosts.values, color='gray')
plt.xlabel("Cantidad de Departamentos")
plt.ylabel("Host ID")
plt.title("Top 10 Hosts con Más Departamentos")
plt.gca().invert_yaxis()  # largest host at the top
plt.tight_layout()
plt.show()
print("Distribución de Tipos de Habitación:\n")

# Listings per room type and their share of the total.
room_counts = df_clean['room_type'].value_counts()
total = room_counts.sum()
percentages = (room_counts / total * 100).round(1)

plt.figure(figsize=(10, 6))
sns.barplot(x=room_counts.values, y=room_counts.index, palette='Set2')

# Annotate each bar with "count (percentage%)", slightly to the right of its end.
for i, (count, pct) in enumerate(zip(room_counts.values, percentages.values)):
    plt.text(count + total * 0.01, i, f"{count} ({pct}%)", va='center', fontweight='bold')

plt.xlabel('Cantidad de Alojamientos')
plt.ylabel('Tipo de Habitación')
plt.tight_layout()
plt.show()
print(f"Total de barrios únicos: {df_clean['neighbourhood'].nunique()}\n")

# The 15 neighbourhoods with the most listings.
top_neighbourhoods = df_clean['neighbourhood'].value_counts().head(15)
print("Top 15 barrios con más alojamientos:\n")

plt.figure(figsize=(15, 8))
sns.barplot(x=top_neighbourhoods.values, y=top_neighbourhoods.index.tolist())
plt.xlabel('Número de alojamientos')
plt.ylabel('Barrio')

# Write the count next to each bar.
for i, v in enumerate(top_neighbourhoods.values):
    plt.text(v + 10, i, str(v), va='center')

plt.tight_layout()
plt.show()
print("Análisis de Reseñas\n")

reviews_data = df_clean['number_of_reviews'].dropna()
has_no_reviews = reviews_data == 0

print(f"Promedio de reseñas por alojamiento: {reviews_data.mean():.2f}")
print(f"Mediana de reseñas: {reviews_data.median():.2f}")
print(f"Alojamientos sin reseñas: {has_no_reviews.sum()} ({has_no_reviews.mean()*100:.1f}%)")

# Histogram of the number of reviews per listing. The 1x2 grid is kept from
# the original layout even though only the first slot is used.
plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
plt.hist(reviews_data.values, bins=50, alpha=0.7, color='orange', edgecolor='black')
plt.title('Distribución del Número de Reseñas')
plt.xlabel('Número de Reseñas')
plt.ylabel('Frecuencia')
plt.tight_layout()
plt.show()
print("Análisis Geográfico\n")

# Listings that have coordinates.
geo_data = df_clean.dropna(subset=['latitude', 'longitude'])
print(f"Rango de latitud: {geo_data['latitude'].min():.4f} a {geo_data['latitude'].max():.4f}")
print(f"Rango de longitud: {geo_data['longitude'].min():.4f} a {geo_data['longitude'].max():.4f}")

# Keep positive prices inside the P5-P95 band so extreme values do not
# dominate the colour scale.
geo_price_data = geo_data.dropna(subset=['price'])
geo_price_data = geo_price_data[geo_price_data['price'] > 0]
p5 = geo_price_data['price'].quantile(0.05)
p95 = geo_price_data['price'].quantile(0.95)
geo_price_data = geo_price_data[(geo_price_data['price'] >= p5) & (geo_price_data['price'] <= p95)]

# Centre the map on the mean coordinate of the remaining listings.
center_lat = geo_price_data['latitude'].mean()
center_lon = geo_price_data['longitude'].mean()
mapa = folium.Map(
    location=[center_lat, center_lon],
    zoom_start=12,
    tiles='CartoDB positron'
)

# Min-max normalise the price to [0, 1]; it is used as the heat intensity.
# When every price is identical the range is zero, so a constant intensity is
# used instead of dividing by zero.
price_min = geo_price_data['price'].min()
price_range = geo_price_data['price'].max() - price_min
if price_range > 0:
    precios_norm = (geo_price_data['price'] - price_min) / price_range
else:
    precios_norm = pd.Series(1.0, index=geo_price_data.index)

# List of [lat, lon, intensity] points, built column-wise instead of with a
# row-by-row loop (also immune to duplicated index labels).
heat_data = (
    geo_price_data[['latitude', 'longitude']]
    .assign(intensidad=precios_norm)
    .values.tolist()
)

# Add the heat-map layer.
HeatMap(
    heat_data,
    min_opacity=0.3,
    max_zoom=18,
    radius=20,
    blur=15,
    gradient={
        0.0: 'darkblue',  # lowest prices (P5)
        0.2: 'blue',
        0.4: 'cyan',
        0.6: 'lime',
        0.8: 'yellow',
        0.9: 'orange',
        1.0: 'red'        # highest prices (P95)
    }
).add_to(mapa)

# Fixed-position HTML legend drawn on top of the map.
legend_html = '''
<div style="position: fixed;
bottom: 50px; left: 50px; width: 240px; height: 180px;
background-color: white; border:3px solid #333; z-index:9999;
font-size:14px; padding: 15px; border-radius: 10px; box-shadow: 0 4px 8px rgba(0,0,0,0.3);">
<p style="margin: 0 0 10px 0; font-weight: bold; font-size: 16px;">🔥 Mapa de Calor - Precios</p>
<p style="margin: 0 0 8px 0; font-size: 12px; color: #666;">Filtrado entre P5 - P95</p>
<div style="background: linear-gradient(to right, darkblue, blue, cyan, lime, yellow, orange, red); height: 20px; width: 100%; margin: 10px 0; border-radius: 5px;"></div>
<div style="display: flex; justify-content: space-between; font-size: 11px; margin-bottom: 8px;">
<span>Bajo</span>
<span>Medio</span>
<span>Alto</span>
</div>
<p style="margin: 5px 0; font-size: 12px;">
<b>Propiedades:</b> ''' + str(len(geo_price_data)) + '''
</p>
</div>
'''
mapa.get_root().html.add_child(folium.Element(legend_html))
display(mapa)
# Listings with coordinates and a positive price, trimmed to the P5-P95 band.
geo_price_data = df_clean.dropna(subset=['latitude', 'longitude', 'price'])
geo_price_data = geo_price_data[geo_price_data['price'] > 0]
p95 = geo_price_data['price'].quantile(0.95)
p05 = geo_price_data['price'].quantile(0.05)
geo_price_data = geo_price_data[(geo_price_data['price'] <= p95) & (geo_price_data['price'] >= p05)]

# Base map centred on the mean coordinate of the remaining listings.
center_lat = geo_price_data['latitude'].mean()
center_lon = geo_price_data['longitude'].mean()
mapa = folium.Map(location=[center_lat, center_lon], zoom_start=12, tiles='CartoDB positron')

# Price -> colour mapping on the yellow-orange-red scale.
norm = colors.Normalize(vmin=geo_price_data['price'].min(), vmax=geo_price_data['price'].max())
# matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap works on both older and current releases.
cmap = plt.get_cmap('YlOrRd')

# Draw at most 500 randomly chosen listings to keep the map responsive.
sample_data = geo_price_data.sample(min(500, len(geo_price_data)))
def generar_tooltip(row):
    """Build the HTML tooltip shown for one listing on the map.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            'name', 'price', 'neighbourhood' and 'room_type'.

    Returns:
        An HTML fragment with the listing name, price, neighbourhood and room
        type. The three text fields are HTML-escaped because they are
        interpolated straight into markup; a listing name containing '<' or
        '&' would otherwise break or inject into the tooltip.
    """
    from html import escape  # local import: the block stays self-contained

    # str() first: missing values arrive as float NaN, which escape() rejects.
    nombre = escape(str(row['name']))
    barrio = escape(str(row['neighbourhood']))
    tipo = escape(str(row['room_type']))
    return (
        f"<div style='min-width: 220px;'>"
        f"<b>Nombre:</b> <span style='font-weight:bold'>{nombre}</span><br>"
        f"<b>Precio:</b> <span style='font-weight:bold'>${row['price']:,.0f}</span><br>"
        f"<b>Barrio:</b> <span style='font-weight:bold'>{barrio}</span><br>"
        f"<b>Tipo:</b> <span style='font-weight:bold'>{tipo}</span>"
        f"</div>"
    )
# One circle per sampled listing, coloured by its price.
for _, row in sample_data.iterrows():
    color = colors.to_hex(cmap(norm(row['price'])))
    folium.CircleMarker(
        location=[row['latitude'], row['longitude']],
        radius=5,
        color=color,
        fill=True,
        fillColor=color,
        fill_opacity=0.8,
        tooltip=folium.Tooltip(generar_tooltip(row), sticky=True, direction='top')
    ).add_to(mapa)

# Marker for the cheapest listing (taken from the full filtered set, not the sample).
min_row = geo_price_data.loc[geo_price_data['price'].idxmin()]
folium.Marker(
    location=[min_row['latitude'], min_row['longitude']],
    icon=folium.Icon(color='green', icon='arrow-down', prefix='fa'),
    tooltip=folium.Tooltip(generar_tooltip(min_row), sticky=True, direction='top')
).add_to(mapa)

# Marker for the most expensive listing.
max_row = geo_price_data.loc[geo_price_data['price'].idxmax()]
folium.Marker(
    location=[max_row['latitude'], max_row['longitude']],
    icon=folium.Icon(color='red', icon='arrow-up', prefix='fa'),
    tooltip=folium.Tooltip(generar_tooltip(max_row), sticky=True, direction='top')
).add_to(mapa)

# Fixed-position HTML legend showing the price range of the colour scale.
min_price = int(geo_price_data['price'].min())
max_price = int(geo_price_data['price'].max())
legend_html = f'''
<div style="position: fixed;
bottom: 50px; left: 50px; width: 240px; height: 130px;
background-color: white; border:3px solid #333; z-index:9999;
font-size:14px; padding: 10px 15px; border-radius: 10px; box-shadow: 0 4px 8px rgba(0,0,0,0.3);">
<p style="margin: 0 0 8px 0; font-weight: bold; font-size: 15px;">💰 Escala de Precios por Noche</p>
<p style="margin: 0 0 8px 0; font-size: 12px; color: #666;">Filtrado entre P5 - P95</p>
<div style="display: flex; align-items: center;">
<span style="flex: 1; text-align: left;">${min_price:,.0f}</span>
<div style="flex: 6; height: 15px; background: linear-gradient(to right, #ffffcc, #ffeda0, #feb24c, #f03b20); margin: 0 10px;"></div>
<span style="flex: 1; text-align: right;">${max_price:,.0f}</span>
</div>
</div>
'''
mapa.get_root().html.add_child(folium.Element(legend_html))
display(mapa)
Las preguntas que surgen del análisis exploratorio son:
# Listings with a neighbourhood and a positive price.
price_neighbourhood = df_clean.dropna(subset=['price', 'neighbourhood'])
price_neighbourhood = price_neighbourhood[price_neighbourhood['price'] > 0]

# Trim prices to the P5-P95 band before comparing neighbourhoods.
p5 = price_neighbourhood['price'].quantile(0.05)
p95 = price_neighbourhood['price'].quantile(0.95)
print(" Filtrado de precios por barrio:")
print(f"Percentil 5 (mínimo): ${p5:.2f}")
print(f"Percentil 95 (máximo): ${p95:.2f}")
print(f"Propiedades antes del filtro: {len(price_neighbourhood):,}")
price_neighbourhood = price_neighbourhood[(price_neighbourhood['price'] >= p5) & (price_neighbourhood['price'] <= p95)]
print(f"Propiedades después del filtro: {len(price_neighbourhood):,}")

# Mean / median price and listing count per neighbourhood.
neighbourhood_stats = price_neighbourhood.groupby('neighbourhood').agg({
    'price': ['mean', 'median', 'count']
}).round(2)
neighbourhood_stats.columns = ['precio_promedio', 'precio_mediano', 'cantidad_listados']

# Ignore neighbourhoods with fewer than 5 listings, then sort most expensive first.
neighbourhood_stats = neighbourhood_stats[neighbourhood_stats['cantidad_listados'] >= 5]
neighbourhood_stats = neighbourhood_stats.sort_values('precio_promedio', ascending=False)

print("\nTOP 10 BARRIOS MÁS CAROS (P5-P95):")
print(neighbourhood_stats.head(10))
print("\nTOP 10 BARRIOS MÁS BARATOS (P5-P95):")
print(neighbourhood_stats.tail(10))

print("\n ESTADÍSTICAS GENERALES (P5-P95):")
print(f"Barrios analizados: {len(neighbourhood_stats)}")
print(f"Precio promedio general: ${price_neighbourhood['price'].mean():.2f}")
print(f"Precio mediano general: ${price_neighbourhood['price'].median():.2f}")
print(f"Rango de precios: ${p5:.2f} - ${p95:.2f}")
# Average price of the 10 most expensive and the 10 cheapest neighbourhoods.
plt.figure(figsize=(15, 12))

# Top panel: most expensive (neighbourhood_stats is sorted by mean price, descending).
plt.subplot(2, 1, 1)
top_expensive = neighbourhood_stats.head(10)
sns.barplot(x=top_expensive['precio_promedio'].values, y=top_expensive.index.tolist())
plt.title('Top 10 Barrios Más Caros (Precio Promedio)')
plt.xlabel('Precio Promedio ($)')

# Bottom panel: cheapest.
plt.subplot(2, 1, 2)
top_cheap = neighbourhood_stats.tail(10)
sns.barplot(x=top_cheap['precio_promedio'].values, y=top_cheap.index.tolist())
plt.title('Top 10 Barrios Más Baratos (Precio Promedio)')
plt.xlabel('Precio Promedio ($)')

plt.tight_layout()
plt.show()
# Price distribution inside the 10 most expensive and the 10 cheapest
# neighbourhoods, with the overall mean and median drawn as reference lines.
overall_mean = price_neighbourhood['price'].mean()
overall_median = price_neighbourhood['price'].median()

plt.figure(figsize=(15, 14))

# Top panel: most expensive neighbourhoods, in ranking order.
plt.subplot(2, 1, 1)
top_expensive_names = neighbourhood_stats.head(10).index.tolist()
top_expensive_data = price_neighbourhood[price_neighbourhood['neighbourhood'].isin(top_expensive_names)]
order_expensive = top_expensive_names
sns.boxplot(data=top_expensive_data, x='price', y='neighbourhood',
            order=order_expensive, orient='h')
plt.title('Top 10 Barrios Más Caros - Distribución de Precios (P5-P95)', fontsize=14, fontweight='bold')
plt.xlabel('Precio ($)')
plt.ylabel('Barrio')
plt.axvline(overall_mean, color='blue', linestyle='--', alpha=0.7, label='Promedio General')
plt.axvline(overall_median, color='green', linestyle='--', alpha=0.7, label='Mediana General')
plt.legend()

# Bottom panel: cheapest neighbourhoods, cheapest first.
plt.subplot(2, 1, 2)
top_cheap_names = neighbourhood_stats.tail(10).index.tolist()
top_cheap_data = price_neighbourhood[price_neighbourhood['neighbourhood'].isin(top_cheap_names)]
order_cheap = neighbourhood_stats.tail(10).sort_values('precio_promedio', ascending=True).index.tolist()
sns.boxplot(data=top_cheap_data, x='price', y='neighbourhood',
            order=order_cheap, orient='h')
plt.title('Top 10 Barrios Más Baratos - Distribución de Precios (P5-P95)', fontsize=14, fontweight='bold')
plt.xlabel('Precio ($)')
plt.ylabel('Barrio')
plt.axvline(overall_mean, color='blue', linestyle='--', alpha=0.7, label='Promedio General')
plt.axvline(overall_median, color='green', linestyle='--', alpha=0.7, label='Mediana General')
plt.legend()

plt.tight_layout()
plt.show()
# Price histogram for the Palermo neighbourhood.
#
# `palermo_filtered` was used here without ever being defined, so the cell
# raised NameError when the notebook ran top to bottom. It is now derived from
# the price-trimmed table.
# NOTE(review): assumes the intended subset is neighbourhood == 'Palermo'
# within price_neighbourhood — confirm against the original analysis.
palermo_filtered = price_neighbourhood[price_neighbourhood['neighbourhood'] == 'Palermo']

# Reference percentiles, computed on the whole price-trimmed table as before.
p5 = price_neighbourhood['price'].quantile(0.05)
p95 = price_neighbourhood['price'].quantile(0.95)

plt.figure(figsize=(10, 6))

# sns.distplot is deprecated (since seaborn 0.11) and scheduled for removal;
# histplot is its replacement. Its default statistic is a count, which matches
# the y-axis label below.
sns.histplot(palermo_filtered['price'], bins=30, kde=True, color='gray')

plt.axvline(p5, color='red', linestyle='--', linewidth=2, label='P5')
plt.axvline(p95, color='red', linestyle='--', linewidth=2, label='P95')

# Labels are placed at 80 % of the axis height, read after the histogram is drawn.
label_height = plt.ylim()[1] * 0.8
plt.text(p5, label_height, 'P5', color='red', ha='left', fontsize=14, fontweight='bold')
plt.text(p95, label_height, 'P95', color='red', ha='left', fontsize=14, fontweight='bold')

plt.title('Histograma de precios en Palermo (P5–P95)')
plt.xlabel('Precio ($)')
plt.ylabel('Cantidad de alojamientos')
plt.tight_layout()
plt.show()
# Listings with a positive price and a review count. The explicit copy makes
# the column assignment below act on an independent frame instead of on a
# slice of df_clean (avoids pandas' SettingWithCopyWarning).
review_price = df_clean.dropna(subset=['price', 'number_of_reviews'])
review_price = review_price[review_price['price'] > 0].copy()

# Pearson correlation between price and number of reviews.
correlation = review_price['price'].corr(review_price['number_of_reviews'])

# Bucket listings by review count. The first edge is -1 (not 0) so that
# listings with exactly 0 reviews fall into the first, right-closed bin.
review_price['review_category'] = pd.cut(
    review_price['number_of_reviews'],
    bins=[-1, 0, 10, 50, 100, float('inf')],
    labels=['Sin reseñas', '1-10 reseñas', '11-50 reseñas', '51-100 reseñas', '100+ reseñas']
)

# observed=False keeps every category in the result (the long-standing
# behaviour) and silences the FutureWarning newer pandas emits when the
# argument is omitted for a categorical key.
price_by_reviews = (
    review_price
    .groupby('review_category', observed=False)['price']
    .agg(['mean', 'median', 'count'])
)

print("\n Precio promedio por categoría de reseñas:")
print(price_by_reviews)
print("\n Correlación:")
print(correlation)
# Four-panel figure relating price to the number of reviews.
plt.figure(figsize=(16, 12))

# Panel 1: raw scatter of price against review count.
plt.subplot(2, 2, 1)
plt.scatter(review_price['number_of_reviews'].values, review_price['price'].values, alpha=0.5, s=30)
plt.xlabel('Número de Reseñas')
plt.ylabel('Precio ($)')
plt.title(f'Relación Precio vs Reseñas (r={correlation:.3f})', fontweight='bold')
plt.grid(True, alpha=0.3)

# Panel 2: price distribution per review category.
plt.subplot(2, 2, 2)
sns.boxplot(data=review_price, x='review_category', y='price', palette='Set2')
plt.xticks(rotation=45)
plt.title('Distribución de Precios por Categoría de Reseñas', fontweight='bold')
plt.ylabel('Precio ($)')

# Panel 3: same scatter after dropping the top 1 % of both variables.
plt.subplot(2, 2, 3)
q99_price = review_price['price'].quantile(0.99)
q99_reviews = review_price['number_of_reviews'].quantile(0.99)
filtered_data = review_price[
    (review_price['price'] <= q99_price) &
    (review_price['number_of_reviews'] <= q99_reviews)
]
filtered_correlation = filtered_data['price'].corr(filtered_data['number_of_reviews'])
plt.scatter(filtered_data['number_of_reviews'].values, filtered_data['price'].values,
            alpha=0.6, color='green', s=25)
plt.xlabel('Número de Reseñas')
plt.ylabel('Precio ($)')
plt.title(f'Sin Outliers Extremos (r={filtered_correlation:.3f})', fontweight='bold')
plt.grid(True, alpha=0.3)

# Panel 4: mean price per review category.
plt.subplot(2, 2, 4)
avg_prices = price_by_reviews['mean']
# Named bar_colors rather than `colors`: the notebook imports
# matplotlib.colors as `colors`, and rebinding that name to a list breaks any
# later (re)run of the map cells that call colors.Normalize / colors.to_hex.
bar_colors = ['lightcoral', 'lightblue', 'lightgreen', 'gold', 'plum']
bars = plt.bar(range(len(avg_prices)), avg_prices.values, color=bar_colors)

# Write the value just above each bar (offset by 1 % of the bar height).
for bar, value in zip(bars, avg_prices.values):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + value*0.01,
             f'${value:.0f}', ha='center', va='bottom', fontweight='bold')

plt.xticks(range(len(avg_prices)), avg_prices.index.tolist(), rotation=45)
plt.title('Precio Promedio por Categoría de Reseñas', fontweight='bold')
plt.ylabel('Precio Promedio ($)')

plt.tight_layout()
plt.show()
# Textual summary of the price / reviews analysis.
print("\n ANÁLISIS DETALLADO:")
print(f"Correlación completa: {correlation:.4f}")
print(f"Correlación sin outliers: {filtered_correlation:.4f}")
print(f"Total de propiedades analizadas: {len(review_price):,}")
print(f"Propiedades sin outliers: {len(filtered_data):,}")

print("\n INSIGHTS POR CATEGORÍA:")
total_properties = len(review_price)
for category in price_by_reviews.index:
    # Scalar lookups keep 'count' as an integer, so the thousands separator
    # prints without a trailing ".0".
    mean_price = price_by_reviews.loc[category, 'mean']
    median_price = price_by_reviews.loc[category, 'median']
    count = price_by_reviews.loc[category, 'count']
    print(f"{category}:")
    print(f" • Precio promedio: ${mean_price:.2f}")
    print(f" • Precio mediano: ${median_price:.2f}")
    print(f" • Cantidad de listados: {count:,}")
    print(f" • Porcentaje del total: {(count/total_properties*100):.1f}%")
# P5 / P95 of price and of review count within the neighbourhood table.
p5_price, p95_price = price_neighbourhood['price'].quantile([0.05, 0.95])
p5_reviews, p95_reviews = price_neighbourhood['number_of_reviews'].quantile([0.05, 0.95])

# Rows inside the price band.
price_in_band = (price_neighbourhood['price'] >= p5_price) & (price_neighbourhood['price'] <= p95_price)
# Rows inside the review-count band.
reviews_in_band = (
    (price_neighbourhood['number_of_reviews'] >= p5_reviews)
    & (price_neighbourhood['number_of_reviews'] <= p95_reviews)
)

# Filter by price only (P5-P95).
filtered_price = price_neighbourhood[price_in_band]
# Filter by price and number of reviews (P5-P95 on both).
filtered_price_reviews = price_neighbourhood[price_in_band & reviews_in_band]

# Three stacked panels on a tall figure.
fig, axs = plt.subplots(3, 1, figsize=(12, 20))

# Shared font sizes.
title_font = 16
label_font = 14
tick_font = 12

# Panel 1: unfiltered scatter.
sns.scatterplot(
    data=price_neighbourhood,
    x='number_of_reviews',
    y='price',
    alpha=0.4,
    s=50,  # marker size
    ax=axs[0]
)

# Panel 2: scatter restricted to P5-P95 on both variables.
sns.scatterplot(
    data=filtered_price_reviews,
    x='number_of_reviews',
    y='price',
    alpha=0.4,
    color='green',
    s=50,
    ax=axs[1]
)

# Panel 3: linear regression on the filtered data.
sns.regplot(
    data=filtered_price_reviews,
    x='number_of_reviews',
    y='price',
    scatter_kws={'alpha': 0.4, 's': 50},
    line_kws={'color': 'red'},
    ax=axs[2]
)

# Titles differ per panel; axis labels and tick sizes are shared.
panel_titles = [
    'Relación Número de Reseñas y Precio (Original)',
    'Relación Reseñas y Precio (P5-P95 en Precio y Reseñas)',
    'Regresión lineal: Precio vs. Número de Reseñas (sin outliers)',
]
for ax, panel_title in zip(axs, panel_titles):
    ax.set_title(panel_title, fontsize=title_font)
    ax.set_xlabel('Número de Reseñas', fontsize=label_font)
    ax.set_ylabel('Precio ($)', fontsize=label_font)
    ax.tick_params(labelsize=tick_font)

plt.tight_layout()
plt.show()
# Median price per neighbourhood, split into three equally sized price tiers.
price_median_by_neighbourhood = price_neighbourhood.groupby('neighbourhood')['price'].median().reset_index()
price_median_by_neighbourhood.columns = ['neighbourhood', 'median_price']
price_median_by_neighbourhood['price_category'] = pd.qcut(
    price_median_by_neighbourhood['median_price'],
    q=3,
    labels=['Bajo', 'Mediano', 'Alto']
)

# Review statistics per neighbourhood, joined with the price tier.
reviews_by_neighbourhood = price_neighbourhood.groupby('neighbourhood')['number_of_reviews'].agg(['mean', 'median', 'count']).reset_index()
reviews_by_neighbourhood = reviews_by_neighbourhood.merge(price_median_by_neighbourhood, on='neighbourhood')

# The 15 neighbourhoods with the highest mean number of reviews per listing.
top_15_reviews = reviews_by_neighbourhood.sort_values('mean', ascending=False).head(15)

# Horizontal bars sorted by mean reviews and coloured by price tier.
chart = alt.Chart(top_15_reviews).mark_bar(
    stroke='black',
    strokeWidth=1
).encode(
    x=alt.X('mean:Q',
            title='Promedio de Reseñas',
            scale=alt.Scale(nice=True)),
    y=alt.Y('neighbourhood:N',
            title='Barrio',
            sort=alt.SortField(field='mean', order='descending')),
    color=alt.Color('price_category:N',
                    scale=alt.Scale(
                        domain=['Bajo', 'Mediano', 'Alto'],
                        range=['#2ca02c', '#ffbf00', '#d62728']
                    ),
                    legend=alt.Legend(title="Categoría de Precio")),
    tooltip=[
        alt.Tooltip('neighbourhood:N', title='Barrio'),
        alt.Tooltip('mean:Q', title='Promedio de Reseñas', format='.2f'),
        alt.Tooltip('median:Q', title='Mediana de Reseñas', format='.2f'),
        alt.Tooltip('count:Q', title='Cantidad de Listados'),
        alt.Tooltip('median_price:Q', title='Mediana de Precio', format='.2f'),
        alt.Tooltip('price_category:N', title='Categoría de Precio')
    ]
).properties(
    width=600,
    height=400,
    title=alt.TitleParams(
        text='Top 15 Barrios con Mayor Promedio de Reseñas por Alojamiento',
        fontSize=16,
        fontWeight='bold',
        anchor='start'
    )
).configure_axis(
    labelFontSize=11,
    titleFontSize=12
).configure_title(
    fontSize=14,
    fontWeight='bold'
)
chart
# Review statistics per neighbourhood, ranked by the median number of reviews.
reviews_by_neighbourhood = price_neighbourhood.groupby('neighbourhood')['number_of_reviews'].agg(['mean', 'median', 'count']).reset_index()
reviews_by_neighbourhood = reviews_by_neighbourhood.sort_values('median', ascending=False)

# Bar chart of the top 15.
plt.figure(figsize=(12, 8))
sns.barplot(data=reviews_by_neighbourhood.head(15), y='neighbourhood', x='median')
plt.title('Top 15 Barrios con Mayor Mediana de Reseñas por Alojamiento')
plt.xlabel('Mediana de Reseñas')
plt.ylabel('Barrio')
plt.tight_layout()
plt.show()
# Median price per neighbourhood.
price_median_by_neighbourhood = price_neighbourhood.groupby('neighbourhood')['price'].median().reset_index()
price_median_by_neighbourhood.columns = ['neighbourhood', 'median_price']

# Five equally sized price tiers.
price_median_by_neighbourhood['price_category'] = pd.qcut(
    price_median_by_neighbourhood['median_price'],
    q=5,
    labels=['Muy Bajo', 'Bajo', 'Mediano', 'Alto', 'Muy Alto']
)

# Renewal rate: reviews in the last twelve months divided by all reviews.
# Neighbourhoods without any review are dropped to avoid dividing by zero.
review_stats = price_neighbourhood.groupby('neighbourhood')[['number_of_reviews', 'number_of_reviews_ltm']].sum().reset_index()
review_stats = review_stats[review_stats['number_of_reviews'] > 0].copy()
review_stats['renewal_rate'] = review_stats['number_of_reviews_ltm'] / review_stats['number_of_reviews']

# Attach the price tier.
review_stats = review_stats.merge(price_median_by_neighbourhood, on='neighbourhood')

# The 15 neighbourhoods with the highest renewal rate.
top_renewing = review_stats.sort_values('renewal_rate', ascending=False).head(15)

# Horizontal bars coloured by price tier.
chart = alt.Chart(top_renewing).mark_bar(
    stroke='black',
    strokeWidth=1
).encode(
    x=alt.X('renewal_rate:Q', title='Tasa de Renovación (último año / total)', scale=alt.Scale(domain=[0, 1])),
    y=alt.Y('neighbourhood:N', title='Barrio', sort=alt.SortField(field='renewal_rate', order='descending')),
    color=alt.Color('price_category:N',
                    scale=alt.Scale(
                        domain=['Muy Bajo', 'Bajo', 'Mediano', 'Alto', 'Muy Alto'],
                        range=['#1a9850', '#91cf60', '#fee08b', '#fc8d59', '#d73027']
                    ),
                    legend=alt.Legend(title='Categoría de Precio')),
    tooltip=[
        alt.Tooltip('neighbourhood:N', title='Barrio'),
        alt.Tooltip('renewal_rate:Q', title='Tasa de Renovación', format='.2f'),
        alt.Tooltip('number_of_reviews_ltm:Q', title='Reviews Último Año'),
        alt.Tooltip('number_of_reviews:Q', title='Reviews Totales'),
        alt.Tooltip('median_price:Q', title='Mediana Precio', format='.0f'),
        alt.Tooltip('price_category:N', title='Categoría Precio')
    ]
).properties(
    width=650,
    height=400,
    title='Top 15 Barrios con Mayor Tasa de Reseñas en el último año'
).configure_axis(
    labelFontSize=11,
    titleFontSize=12
).configure_title(
    fontSize=14,
    fontWeight='bold'
)
chart
# Median price per neighbourhood.
price_median_by_neighbourhood = price_neighbourhood.groupby('neighbourhood')['price'].median().reset_index()
price_median_by_neighbourhood.columns = ['neighbourhood', 'median_price']

# Five equally sized price tiers.
price_median_by_neighbourhood['price_category'] = pd.qcut(
    price_median_by_neighbourhood['median_price'],
    q=5,
    labels=['Muy Bajo', 'Bajo', 'Mediano', 'Alto', 'Muy Alto']
)

# Review totals and renewal rate (last-twelve-months reviews / all reviews);
# neighbourhoods without any review are dropped to avoid dividing by zero.
review_stats = price_neighbourhood.groupby('neighbourhood')[['number_of_reviews', 'number_of_reviews_ltm']].sum().reset_index()
review_stats = review_stats[review_stats['number_of_reviews'] > 0].copy()
review_stats['renewal_rate'] = review_stats['number_of_reviews_ltm'] / review_stats['number_of_reviews']

# Attach the price data.
review_stats = review_stats.merge(price_median_by_neighbourhood, on='neighbourhood')

# Keep neighbourhoods with at least 10 reviews in total.
filtered_stats = review_stats[review_stats['number_of_reviews'] >= 10].copy()

# Scatter: median price (x) against renewal rate (y); point size encodes the
# total number of reviews and colour encodes the price tier.
# NOTE(review): the emoji in the tooltip titles were restored from garbled
# text; the ones for renewal rate and the two review counts are best guesses.
scatter_chart = alt.Chart(filtered_stats).mark_circle(
    size=100,
    stroke='white',
    strokeWidth=1
).encode(
    x=alt.X('median_price:Q',
            title='Precio Mediano por Barrio ($)',
            scale=alt.Scale(nice=True)),
    y=alt.Y('renewal_rate:Q',
            title='Tasa de Renovación de Reseñas',
            scale=alt.Scale(domain=[0, 1])),
    color=alt.Color('price_category:N',
                    scale=alt.Scale(
                        domain=['Muy Bajo', 'Bajo', 'Mediano', 'Alto', 'Muy Alto'],
                        range=['#1a9850', '#91cf60', '#fee08b', '#fc8d59', '#d73027']
                    ),
                    legend=alt.Legend(title='Categoría de Precio')),
    size=alt.Size('number_of_reviews:Q',
                  scale=alt.Scale(range=[50, 400]),
                  legend=alt.Legend(title='Total de Reseñas')),
    tooltip=[
        alt.Tooltip('neighbourhood:N', title='🏘️ Barrio'),
        alt.Tooltip('median_price:Q', title='💰 Precio Mediano', format='$.0f'),
        alt.Tooltip('renewal_rate:Q', title='🔄 Tasa de Renovación', format='.2f'),
        alt.Tooltip('number_of_reviews_ltm:Q', title='📅 Reviews Último Año'),
        alt.Tooltip('number_of_reviews:Q', title='📊 Reviews Totales'),
        alt.Tooltip('price_category:N', title='🏷️ Categoría Precio')
    ]
).properties(
    width=700,
    height=500,
    title=alt.TitleParams(
        text='Relación entre Precio y Tasa de Reseñas por Barrio',
        subtitle='El tamaño del punto representa el total de reseñas',
        fontSize=16,
        fontWeight='bold',
        anchor='start'
    )
).configure_axis(
    labelFontSize=11,
    titleFontSize=12,
    grid=True,
    gridOpacity=0.3
).configure_title(
    fontSize=14,
    fontWeight='bold'
)

# Show the chart.
scatter_chart
# Availability vs. price: correlation, price statistics per availability
# bucket, a few headline figures and a 2x2 overview figure.

# Keep listings with a known availability and a strictly positive price.
# .copy() detaches the filtered frame so that adding a column below does not
# raise SettingWithCopyWarning (on pandas versions without copy-on-write).
avail_price = df_clean.dropna(subset=['availability_365', 'price'])
avail_price = avail_price[avail_price['price'] > 0].copy()

correlation = avail_price['availability_365'].corr(avail_price['price'])
print(f"\nCorrelaciΓ³n entre disponibilidad y precio: {correlation:.3f}")

# pd.cut builds right-closed bins, so without include_lowest=True the first
# bin is (0, 90] and listings with availability_365 == 0 become NaN even
# though the label promises "0-90". include_lowest=True closes it on the left.
avail_price['availability_category'] = pd.cut(
    avail_price['availability_365'],
    bins=[0, 90, 180, 270, 365],
    labels=['Baja (0-90)', 'Media (91-180)', 'Alta (181-270)', 'Muy Alta (271-365)'],
    include_lowest=True,
)

# observed=False is spelled out so every bucket appears in the result (empty
# ones with count 0) regardless of the pandas default for categorical keys.
avail_stats = (
    avail_price
    .groupby('availability_category', observed=False)['price']
    .agg(['mean', 'median', 'count'])
    .round(2)
)
print("\n Precio promedio por nivel de disponibilidad:")
print(avail_stats)

print("\n INSIGHTS ADICIONALES:")
always_available = avail_price[avail_price['availability_365'] == 365]
never_available = avail_price[avail_price['availability_365'] == 0]
print(f"Listados disponibles todo el aΓ±o: {len(always_available)} ({len(always_available)/len(avail_price)*100:.1f}%)")
print(f"Listados no disponibles: {len(never_available)} ({len(never_available)/len(avail_price)*100:.1f}%)")
print(f"Precio promedio (disponibles todo el aΓ±o): ${always_available['price'].mean():.2f}")

plt.figure(figsize=(15, 10))

# Top-left: raw scatter of availability against price.
plt.subplot(2, 2, 1)
plt.scatter(avail_price['availability_365'].values, avail_price['price'].values, alpha=0.5)
plt.xlabel('Disponibilidad (dΓas al aΓ±o)')
plt.ylabel('Precio ($)')
plt.title(f'Disponibilidad vs Precio (r={correlation:.3f})')

# Top-right: price distribution inside each availability bucket.
plt.subplot(2, 2, 2)
sns.boxplot(data=avail_price, x='availability_category', y='price')
plt.xticks(rotation=45)
plt.title('Precio por CategorΓa de Disponibilidad')

# Bottom-left: histogram of available days per year.
plt.subplot(2, 2, 3)
plt.hist(avail_price['availability_365'].values, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
plt.xlabel('DΓas Disponibles al AΓ±o')
plt.ylabel('Frecuencia')
plt.title('DistribuciΓ³n de Disponibilidad')

# Bottom-right: mean price per availability bucket.
plt.subplot(2, 2, 4)
avg_prices = avail_stats['mean']
sns.barplot(x=avg_prices.index.tolist(), y=avg_prices.values, palette='coolwarm')
plt.xticks(rotation=45)
plt.title('Precio Promedio por Nivel de Disponibilidad')
plt.ylabel('Precio Promedio ($)')

plt.tight_layout()
plt.show()
# Isotype (pictogram) chart: a 10-column grid of icons in which each icon
# stands for roughly 1% of the listings, grouped by availability bucket.
# NOTE(review): the emoji / accented strings below look mis-encoded (UTF-8
# read with a Greek code page?); they are reproduced untouched - confirm the
# file encoding.
ICON_TOTAL = 100   # icons drawn when there is data: one per percentage point
GRID_COLUMNS = 10  # icons per grid row

avail_stats_reset = avail_stats.reset_index()
total_listings = avail_stats_reset['count'].sum()
avail_stats_reset['percentage'] = (avail_stats_reset['count'] / total_listings * 100).round(1)

# Largest-remainder allocation from the raw counts. Rounding each percentage
# on its own (after it was already rounded to one decimal, and with
# half-to-even rounding) does not guarantee that the icons add up to 100.
bucket_counts = avail_stats_reset['count'].to_numpy(dtype=float)
if total_listings > 0:
    exact_icons = bucket_counts / total_listings * ICON_TOTAL
    houses_needed = np.floor(exact_icons).astype(int)
    leftover = max(ICON_TOTAL - int(houses_needed.sum()), 0)
    # Ascending sort of (floor - exact) puts the largest fractional parts
    # first; the stable sort keeps the bucket order on ties.
    by_remainder = np.argsort(houses_needed - exact_icons, kind='stable')
    houses_needed[by_remainder[:leftover]] += 1
else:
    # No listings at all: draw nothing instead of failing on NaN percentages.
    houses_needed = np.zeros(len(bucket_counts), dtype=int)
avail_stats_reset['houses_needed'] = houses_needed

# One row per icon: repeat each bucket's row houses_needed times.
repeated_rows = avail_stats_reset.index.repeat(
    avail_stats_reset['houses_needed'].to_numpy()
)
isotype_df = (
    avail_stats_reset
    .loc[repeated_rows]
    .rename(columns={'availability_category': 'category', 'mean': 'price_avg'})
    .loc[:, ['category', 'count', 'percentage', 'price_avg']]
    .reset_index(drop=True)
)
# Plain strings are needed for the dict lookup and concatenation below.
isotype_df['category'] = isotype_df['category'].astype(str)

# Sequential icon id and its position in the grid.
isotype_df['house_id'] = np.arange(len(isotype_df))
isotype_df['x'] = isotype_df['house_id'] % GRID_COLUMNS
isotype_df['y'] = isotype_df['house_id'] // GRID_COLUMNS

emoji_mapping = {
    'Baja (0-90)': 'π₯',
    'Media (91-180)': 'π‘',
    'Alta (181-270)': 'π’',
    'Muy Alta (271-365)': 'ποΈ'
}

# Legend labels combine the icon with the bucket name.
isotype_df['emoji'] = isotype_df['category'].map(emoji_mapping)
isotype_df['category_with_emoji'] = isotype_df['emoji'] + ' ' + isotype_df['category']

# The colour domain is derived from the mapping so it cannot drift from the
# labels built above (dicts preserve insertion order).
legend_domain = [f'{emoji} {label}' for label, emoji in emoji_mapping.items()]

house_chart = alt.Chart(isotype_df).mark_text(
    fontSize=25,
    baseline='middle',
    align='center'
).encode(
    x=alt.X('x:O', axis=None),
    y=alt.Y('y:O', axis=None, sort='descending'),
    text='emoji:N',
    color=alt.Color(
        'category_with_emoji:N',
        scale=alt.Scale(
            domain=legend_domain,
            range=['#e74c3c', '#f39c12', '#f1c40f', '#27ae60']
        ),
        legend=alt.Legend(
            title="π Nivel de Disponibilidad",
            orient="bottom",
            columns=2,
            titleFontSize=14,
            labelFontSize=12,
            symbolSize=100
        )
    ),
    tooltip=[
        alt.Tooltip('category:N', title='π Disponibilidad'),
        alt.Tooltip('count:Q', title='π Propiedades', format=','),
        alt.Tooltip('percentage:Q', title='π Porcentaje', format='.1f'),
        alt.Tooltip('price_avg:Q', title='π° Precio Promedio', format='$.0f')
    ]
).properties(
    width=500,
    height=300,
    title=alt.TitleParams(
        text=[
            'ποΈ DistribuciΓ³n de Disponibilidad - Airbnb Buenos Aires',
            f'Cada casa = ~1% del total | Total: {total_listings:,} propiedades'
        ],
        fontSize=16,
        fontWeight='bold',
        anchor='middle',
        subtitleFontSize=12,
        subtitleColor='#666666'
    )
).configure_view(strokeWidth=0)

# Bare expression so the notebook renders the chart.
house_chart